{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import joblib \n",
    "from joblib import Parallel, delayed\n",
    "from pyentrp import entropy as ent\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def calc_ent_basis(ser, col):\n",
    "    \"\"\"\n",
    "        描述：\n",
    "            计算信息熵的方法\n",
    "    \"\"\"\n",
    "\n",
    "    uniq_value_ser = ser.value_counts()\n",
    "    ser_len = len(ser)\n",
    "    ent = 0.0\n",
    "    for val_cnt in uniq_value_ser:\n",
    "        p = val_cnt / ser_len\n",
    "        logp = np.log2(p)\n",
    "        ent -= p * logp\n",
    "        print(ent)\n",
    "    return {col: ent}\n",
    "def calc_ent_parallel(df, n_jobs=24):\n",
    "    '''\n",
    "    Compute the entropy of every column of ``df`` in parallel.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Frame whose columns are scored one by one with ``calc_ent_basis``.\n",
    "    n_jobs : int, optional\n",
    "        Number of joblib workers (default 24, the value that used to be\n",
    "        hard-coded).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    dict\n",
    "        Mapping from column label to its entropy.\n",
    "    '''\n",
    "    ent_list = Parallel(n_jobs=n_jobs, verbose=1)(\n",
    "        delayed(calc_ent_basis)(df.loc[:, col], col) for col in df.columns\n",
    "    )\n",
    "    # Each worker returns a one-entry dict; merge them into a single mapping.\n",
    "    ent_dict = {}\n",
    "    for item in ent_list:\n",
    "        ent_dict.update(item)\n",
    "    return ent_dict\n",
    "\n",
    "def ent_filter(df, ent_thred=10):\n",
    "    '''\n",
    "    Drop the columns whose entropy is less than or equal to ``ent_thred``.\n",
    "\n",
    "    Returns a tuple ``(filtered_df, ent_dict)``; ``ent_dict`` is the mapping\n",
    "    produced by ``calc_ent_parallel`` for all columns of ``df``.\n",
    "    '''\n",
    "    ent_dict = calc_ent_parallel(df)\n",
    "    scores = pd.Series(ent_dict)\n",
    "    # Labels of the columns that fail the threshold.\n",
    "    rejected = list(scores[scores <= ent_thred].index)\n",
    "    # Walk df.columns so the surviving columns keep their original order.\n",
    "    kept = []\n",
    "    for name in df.columns:\n",
    "        if name not in rejected:\n",
    "            kept.append(name)\n",
    "    return df[kept], ent_dict\n",
    "\n",
    "\n",
    "def calc_permutation_ent_basis(order, ser, col, delay=20000):\n",
    "    '''\n",
    "    Compute the normalized permutation entropy of one column.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    order : int\n",
    "        Passed through as the ``order`` argument of\n",
    "        ``pyentrp.entropy.permutation_entropy``.\n",
    "    ser : pandas.Series\n",
    "        Series whose ``.values`` are analysed.\n",
    "    col : hashable\n",
    "        Label used as the key of the returned dict.\n",
    "    delay : int, optional\n",
    "        Passed through as the ``delay`` argument (default 20000, the value\n",
    "        that used to be hard-coded).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    dict\n",
    "        ``{col: permutation_entropy}``.\n",
    "    '''\n",
    "    pe = ent.permutation_entropy(ser.values, order=order, delay=delay, normalize=True)\n",
    "    return {col: pe}\n",
    "\n",
    "def calc_permutation_ent_parallel(df, order=2, n_jobs=48):\n",
    "    '''\n",
    "    Compute the permutation entropy of every column of ``df`` in parallel.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Frame whose columns are scored with ``calc_permutation_ent_basis``.\n",
    "    order : int, optional\n",
    "        Order forwarded to ``calc_permutation_ent_basis`` (default 2, the\n",
    "        value that used to be hard-coded).\n",
    "    n_jobs : int, optional\n",
    "        Number of joblib workers (default 48, the value that used to be\n",
    "        hard-coded).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    dict\n",
    "        Mapping from column label to its permutation entropy.\n",
    "    '''\n",
    "    pe_list = Parallel(n_jobs=n_jobs, verbose=1)(\n",
    "        delayed(calc_permutation_ent_basis)(order, df.loc[:, col], col)\n",
    "        for col in df.columns\n",
    "    )\n",
    "    # Each worker returns a one-entry dict; merge them into a single mapping.\n",
    "    pe_dict = {}\n",
    "    for item in pe_list:\n",
    "        pe_dict.update(item)\n",
    "    return pe_dict\n",
    "\n",
    "def pe_filter(df, pe_thred=0.999):\n",
    "    '''\n",
    "    Drop the columns whose permutation entropy is greater than or equal to\n",
    "    ``pe_thred``.\n",
    "\n",
    "    Returns a tuple ``(filtered_df, pe_dict)``; ``pe_dict`` is the mapping\n",
    "    produced by ``calc_permutation_ent_parallel`` for all columns of ``df``.\n",
    "    '''\n",
    "    pe_dict = calc_permutation_ent_parallel(df)\n",
    "    scores = pd.Series(pe_dict)\n",
    "    # Labels of the columns that reach or exceed the threshold.\n",
    "    rejected = list(scores[scores >= pe_thred].index)\n",
    "    # Iterate over df.columns so the remaining columns keep their order.\n",
    "    kept = [name for name in df.columns if name not in rejected]\n",
    "    filtered = df[kept]\n",
    "    return filtered, pe_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_train(idx, input_dir='../concats_cut/'):\n",
    "    '''\n",
    "    Load one training set and split it into features and target.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    idx : int\n",
    "        Index of the training set; the file read is ``concat_0<idx>.lz4``.\n",
    "        NOTE(review): idx >= 10 yields a three-digit name such as\n",
    "        ``concat_010.lz4`` - confirm this matches the files on disk.\n",
    "    input_dir : str, optional\n",
    "        Directory holding the dumps (default ``'../concats_cut/'``, the value\n",
    "        that used to be hard-coded).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    (x_train, y_train)\n",
    "        ``x_train`` is the loaded frame without the ``RULR`` column and\n",
    "        ``y_train`` is a one-column DataFrame holding ``RULR``.\n",
    "    '''\n",
    "    # joblib.load unpickles the file, so only point it at trusted dumps.\n",
    "    train = joblib.load(os.path.join(input_dir, 'concat_0%d.lz4' % idx))\n",
    "    # Double brackets keep the target as a DataFrame rather than a Series.\n",
    "    y_train = train[['RULR']]\n",
    "    x_train = train.drop(columns=['RULR'])\n",
    "    return x_train, y_train\n",
    "   "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=24)]: Using backend LokyBackend with 24 concurrent workers.\n",
      "[Parallel(n_jobs=24)]: Done  32 out of  39 | elapsed:    3.5s remaining:    0.8s\n",
      "[Parallel(n_jobs=24)]: Done  39 out of  39 | elapsed:    3.7s finished\n",
      "[Parallel(n_jobs=48)]: Using backend LokyBackend with 48 concurrent workers.\n",
      "[Parallel(n_jobs=48)]: Done  24 out of  39 | elapsed:    0.4s remaining:    0.3s\n",
      "[Parallel(n_jobs=48)]: Done  39 out of  39 | elapsed:    0.6s finished\n"
     ]
    }
   ],
   "source": [
    "# Create the output directory up front; exist_ok makes the cell safe to re-run\n",
    "# and avoids the gap between checking for the directory and creating it.\n",
    "os.makedirs('./train_data', exist_ok=True)\n",
    "\n",
    "# Training set 1: drop the columns whose entropy is <= 1, then the columns\n",
    "# whose normalized permutation entropy is >= 1.\n",
    "x_train_01, y_train_01 = load_train(1)\n",
    "x_train_01_ent, ent_dict_01 = ent_filter(x_train_01, 1)\n",
    "x_train_01_pe, pe_dict_01 = pe_filter(x_train_01_ent, 1)\n",
    "# Nothing is written to disk here: the columns to keep are first merged\n",
    "# across the three training sets (use_cols) and the frames are saved afterwards."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['CL', 'CLI', 'spindle_load', 'Current_1__abs_energy',\n",
       "       'Current_1__kurtosis', 'Current_1__length', 'Current_1__mean',\n",
       "       'Current_1__mean_abs_change', 'Current_1__mean_change',\n",
       "       'Current_1__skewness', 'Current_1__variance', 'Vibration_1__abs_energy',\n",
       "       'Vibration_1__kurtosis', 'Vibration_1__length', 'Vibration_1__mean',\n",
       "       'Vibration_1__mean_abs_change', 'Vibration_1__mean_change',\n",
       "       'Vibration_1__skewness', 'Vibration_1__variance',\n",
       "       'Vibration_2__abs_energy', 'Vibration_2__kurtosis',\n",
       "       'Vibration_2__length', 'Vibration_2__mean',\n",
       "       'Vibration_2__mean_abs_change', 'Vibration_2__mean_change',\n",
       "       'Vibration_2__skewness', 'Vibration_2__variance',\n",
       "       'Vibration_3__abs_energy', 'Vibration_3__kurtosis',\n",
       "       'Vibration_3__length', 'Vibration_3__mean',\n",
       "       'Vibration_3__mean_abs_change', 'Vibration_3__mean_change',\n",
       "       'Vibration_3__skewness', 'Vibration_3__variance',\n",
       "       'Current_1__abs_energy_scale', 'Vibration_1__abs_energy_scale',\n",
       "       'Vibration_2__abs_energy_scale', 'Vibration_3__abs_energy_scale'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Feature columns of training set 1 as loaded, before any filtering.\n",
    "x_train_01.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=24)]: Using backend LokyBackend with 24 concurrent workers.\n",
      "[Parallel(n_jobs=24)]: Done  32 out of  39 | elapsed:    2.1s remaining:    0.5s\n",
      "[Parallel(n_jobs=24)]: Done  39 out of  39 | elapsed:    2.4s finished\n",
      "[Parallel(n_jobs=48)]: Using backend LokyBackend with 48 concurrent workers.\n",
      "[Parallel(n_jobs=48)]: Done  24 out of  39 | elapsed:    0.3s remaining:    0.2s\n",
      "[Parallel(n_jobs=48)]: Done  39 out of  39 | elapsed:    0.5s finished\n"
     ]
    }
   ],
   "source": [
    "# Training set 2: same two-stage filter as for training set 1\n",
    "# (entropy <= 1 removed, then permutation entropy >= 1 removed).\n",
    "x_train_02, y_train_02 = load_train(2)\n",
    "x_train_02_ent, ent_dict_02 = ent_filter(x_train_02, ent_thred=1)\n",
    "x_train_02_pe, pe_dict_02 = pe_filter(x_train_02_ent, pe_thred=1)\n",
    "# Saving is deferred until the column sets of all training sets are merged."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=24)]: Using backend LokyBackend with 24 concurrent workers.\n",
      "[Parallel(n_jobs=24)]: Done  32 out of  39 | elapsed:    2.3s remaining:    0.5s\n",
      "[Parallel(n_jobs=24)]: Done  39 out of  39 | elapsed:    2.5s finished\n",
      "[Parallel(n_jobs=48)]: Using backend LokyBackend with 48 concurrent workers.\n",
      "[Parallel(n_jobs=48)]: Done  24 out of  39 | elapsed:    0.3s remaining:    0.2s\n",
      "[Parallel(n_jobs=48)]: Done  39 out of  39 | elapsed:    0.4s finished\n"
     ]
    }
   ],
   "source": [
    "# Training set 3: same two-stage filter as for training sets 1 and 2\n",
    "# (entropy <= 1 removed, then permutation entropy >= 1 removed).\n",
    "x_train_03, y_train_03 = load_train(3)\n",
    "x_train_03_ent, ent_dict_03 = ent_filter(x_train_03, ent_thred=1)\n",
    "x_train_03_pe, pe_dict_03 = pe_filter(x_train_03_ent, pe_thred=1)\n",
    "# Saving is deferred until the column sets of all training sets are merged."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Check that the entropy-based filters kept the same columns, in the same\n",
    "# order, for all three training sets.\n",
    "list(x_train_01_pe.columns) == list(x_train_02_pe.columns) == list(x_train_03_pe.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Union of the columns kept for any of the three training sets.\n",
    "# dict.fromkeys de-duplicates while keeping first-seen order, so the column\n",
    "# order of the saved frames is the same on every run; iterating a set of\n",
    "# strings gives an order that can change between kernel restarts.\n",
    "use_cols = list(dict.fromkeys(\n",
    "    list(x_train_01_pe.columns) + list(x_train_02_pe.columns) + list(x_train_03_pe.columns)\n",
    "))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['./train_data/train_01.lz4']"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Reload training set 1, keep the columns listed in use_cols plus the target,\n",
    "# and write the result as an lz4-compressed joblib dump.\n",
    "x_train_01, y_train_01 = load_train(1)\n",
    "train_01 = pd.concat([x_train_01[use_cols], y_train_01], axis='columns')\n",
    "joblib.dump(train_01, './train_data/train_01.lz4', compress='lz4')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['./train_data/train_02.lz4']"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Reload training set 2, keep the columns listed in use_cols plus the target,\n",
    "# and write the result as an lz4-compressed joblib dump.\n",
    "x_train_02, y_train_02 = load_train(2)\n",
    "train_02 = pd.concat([x_train_02[use_cols], y_train_02], axis='columns')\n",
    "joblib.dump(train_02, './train_data/train_02.lz4', compress='lz4')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['./train_data/train_03.lz4']"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Reload training set 3, keep the columns listed in use_cols plus the target,\n",
    "# and write the result as an lz4-compressed joblib dump.\n",
    "x_train_03, y_train_03 = load_train(3)\n",
    "train_03 = pd.concat([x_train_03[use_cols], y_train_03], axis='columns')\n",
    "joblib.dump(train_03, './train_data/train_03.lz4', compress='lz4')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Check that the three saved frames share the same columns in the same order.\n",
    "list(train_02.columns) == list(train_01.columns) == list(train_03.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(75039, 40)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Shape (rows, columns) of the frame saved for training set 3.\n",
    "train_03.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
